1 First step

The ALICIA - Concytec page was used to search for undergraduate theses with the following search strategy:

(“Análisis Factorial” OR “Análisis de Componentes Principales” OR “ACP” OR “confiabilidad” OR “fiabilidad” OR “análisis psicométrico” OR “baremación”) OR ((“validez” OR “validación” OR “adaptación” OR “construcción” OR “estandarización”) AND (“escala” OR “batería de pruebas” OR “prueba psicológica” OR “instrumento” OR “cuestionario” OR “test”))

In addition, the following filters were used:

  • Year from 2011 to 2020
  • Undergraduate thesis
  • Open Access thesis

All of this is encoded in a static URL that will be used to inspect the results and store the data in an Excel file.

library(rvest)
library(tidyverse)

# Static search URL. Its query string carries the filters (format
# "bachelorThesis", rights "openAccess", publishDate from 2011 to 2020) and
# the percent-encoded search strategy in the `lookfor` parameter.
url <- "https://alicia.concytec.gob.pe/vufind/Search/Results?filter%5B%5D=format%3A%22bachelorThesis%22&filter%5B%5D=eu_rights_str_mv%3A%22openAccess%22&lookfor=%28%E2%80%9CAn%C3%A1lisis+Factorial%E2%80%9D+OR+%E2%80%9CAn%C3%A1lisis+de+Componentes+Principales%E2%80%9D+OR+%E2%80%9CACP%E2%80%9D+OR+%E2%80%9Cconfiabilidad%E2%80%9D+OR+%E2%80%9Cfiabilidad%E2%80%9D+OR+%E2%80%9Can%C3%A1lisis+psicom%C3%A9trico%E2%80%9D+OR+%22baremaci%C3%B3n%22%29+OR+%28%28%E2%80%9Cvalidez%E2%80%9D+OR+%22validaci%C3%B3n%22+OR+%22adaptaci%C3%B3n%22+OR+%22construcci%C3%B3n%22+OR+%22estandarizaci%C3%B3n%22%29+AND+%28%22escala%22+OR+%E2%80%9Cbater%C3%ADa+de+pruebas%E2%80%9D+OR+%E2%80%9Cprueba+psicol%C3%B3gica%E2%80%9D+OR+%22instrumento%22+OR+%22cuestionario%22+OR+%22test%22%29%29&type=AllFields&daterange%5B%5D=publishDate&publishDatefrom=2011&publishDateto=2020"

# Download and parse the results page behind `url` (no `page` parameter is
# set here).
dina_html <- read_html(url) 

1.1 Determine the number of pages

last_n_page
[1] "811"

2 Loop to read all theses

2.1 Read the HTML of each thesis

# Download and parse the page of every thesis listed in
# `complete_href_dina$tesis_url`. Results are stored in `read_html_thesis`,
# one slot per row, named by the thesis URL.
#
# Each URL is attempted up to `max_attempts` times, pausing `retry_wait`
# seconds between failed attempts. If every attempt fails, the last error
# condition is stored in the slot (as before) and a warning is raised so the
# failure is not silent.
read_html_thesis <- vector("list", nrow(complete_href_dina))

# 11 attempts: same count as the previous `counter <= 10` test, which was
# checked before the counter was incremented.
max_attempts <- 11L
retry_wait <- 2

for (i in seq_len(nrow(complete_href_dina))) {
  thesis_url <- complete_href_dina$tesis_url[i]

  # A URL already stored under its own name (e.g. a repeated link) is not
  # downloaded again; its slot stays NULL.
  if (thesis_url %in% names(read_html_thesis)) {
    next
  }

  cat(paste("Doing thesis number", i, "..."))
  ok <- FALSE
  counter <- 0L

  # `&&` because this is a scalar condition, not an elementwise mask.
  while (!ok && counter < max_attempts) {
    counter <- counter + 1L
    out <- tryCatch(
      read_html(thesis_url),
      error = function(e) e
    )
    if (inherits(out, "error")) {
      cat(".")
      # Only wait when another attempt will actually follow.
      if (counter < max_attempts) {
        Sys.sleep(retry_wait)
      }
    } else {
      ok <- TRUE
      cat("Successful!")
    }
  }
  cat("\n")

  if (!ok) {
    warning(
      "Could not read thesis number ", i, " (", thesis_url, "): ",
      conditionMessage(out),
      call. = FALSE
    )
  }

  read_html_thesis[[i]] <- out
  names(read_html_thesis)[i] <- thesis_url
}

2.2 Extract information

This function helps to extract information about each thesis, such as the title, abstract, etc.

#' Extract the metadata of one thesis page
#'
#' Reads the title from the `.media-body h1` element and turns the first two
#' HTML tables of the page (two-column key/value tables with columns `X1` and
#' `X2`) into wide, single-row tibbles, then binds everything column-wise.
#'
#' @param html A parsed HTML document, as returned by `rvest::read_html()`.
#' @return A tibble with the columns of the first table, a `Titulo` column and
#'   the columns of the second table. `Titulo` is placed right after
#'   "Autor Principal" when that column exists.
extract_information <- function(html) {
  # Turn a key/value table into one wide row, using the keys (with their
  # first ":" removed) as column names.
  widen_table <- function(tbl) {
    tbl %>%
      mutate(X1 = str_remove(X1, ":")) %>%
      pivot_wider(
        names_from = X1,
        values_from = X2
      )
  }

  titulo_txt <- html %>%
    html_elements(".media-body h1") %>%
    html_text2()
  # Keep exactly one title per thesis: the first match, or NA when there is
  # none. A zero-length or multi-element vector would otherwise make
  # bind_cols() return zero or several rows for a single thesis.
  titulo <- tibble(Titulo = titulo_txt[1])

  # Parse the tables once and reuse the result for both lookups.
  tables <- html_table(html)
  if (length(tables) < 2) {
    stop(
      "Expected at least 2 tables in the thesis page, found ",
      length(tables), ".",
      call. = FALSE
    )
  }

  information <- bind_cols(
    widen_table(tables[[1]]),
    titulo,
    widen_table(tables[[2]])
  )

  # Only reorder when the anchor column is present, so a record without
  # "Autor Principal" is still returned instead of raising an error.
  if ("Autor Principal" %in% names(information)) {
    information <- relocate(information, Titulo, .after = "Autor Principal")
  }

  information
}
# Build one metadata tibble per downloaded thesis page.
#
# A slot of `read_html_thesis` may not hold a parsed document: the download
# loop stores the error condition when every attempt failed, and leaves NULL
# for a URL it skipped. Those slots are reported and left as NULL here
# instead of aborting the whole extraction.
thesis_information <- vector("list", length(read_html_thesis))

for (i in seq_along(read_html_thesis)) {
  page <- read_html_thesis[[i]]

  if (!inherits(page, "xml_document")) {
    warning(
      "Skipping thesis number ", i, ": no parsed HTML document available.",
      call. = FALSE
    )
    next
  }

  thesis_information[[i]] <- extract_information(page)
}

Join the full information about all the theses into a single table.

2.3 Remove duplicates

The table has 16211 rows at the moment.

# Remove theses whose titles are identical once letter case is ignored,
# keeping the first occurrence of each title.
#
# bind_rows() first collapses the per-thesis list of tibbles into a single
# table (NULL entries are dropped); it leaves the data unchanged when
# `thesis_information` is already a data frame.
thesis_final <- thesis_information %>%
  bind_rows() %>%
  mutate(titulo_tmp = str_to_upper(Titulo)) %>%
  distinct(titulo_tmp, .keep_all = TRUE) %>%
  select(-titulo_tmp)

Now it has 15150 rows.

3 Export to XLSX

# Export the de-duplicated table. `thesis_information` still contains the
# repeated titles, so writing it would discard the de-duplication step.
openxlsx::write.xlsx(thesis_final,
                     "Table complet thesis psychometric.xlsx")
LS0tCnRpdGxlOiAiSS4gV2ViIFNjcmFwcGluZyBvZiBESU5BIC0gQ29uY3l0ZWMiCmF1dGhvcjogIkJyaWFuIE4uIFBlw7FhLUNhbGVybyIKZGF0ZTogIjIzLzUvMjAyMSIKb3V0cHV0OiAKICBodG1sX25vdGVib29rOiAKICAgIG51bWJlcl9zZWN0aW9uczogeWVzCiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB5ZXMKICAgIGhpZ2hsaWdodDoga2F0ZQogICAgdGhlbWU6IGZsYXRseQplZGl0b3Jfb3B0aW9uczogCiAgY2h1bmtfb3V0cHV0X3R5cGU6IGlubGluZQotLS0KCiMgRmlyc3Qgc3RlcAoKVGhlIFtBTElDSUEgLSBDb25jeXRlYyBwYWdlXShodHRwczovL2FsaWNpYS5jb25jeXRlYy5nb2IucGUpIHdhcyB1c2VkIHRvIHNlYXJjaCBmb3IgdW5kZXJncmFkdWF0ZSB0aGVzZXMgd2l0aCB0aGUgZm9sbG93aW5nIHNlYXJjaCBzdHJhdGVneToKCj4gKCJBbsOhbGlzaXMgRmFjdG9yaWFsIiBPUiAiQW7DoWxpc2lzIGRlIENvbXBvbmVudGVzIFByaW5jaXBhbGVzIiBPUiAiQUNQIiBPUiAiY29uZmlhYmlsaWRhZCIgT1IgImZpYWJpbGlkYWQiIE9SICJhbsOhbGlzaXMgcHNpY29tw6l0cmljbyIgT1IgImJhcmVtYWNpw7NuIikgT1IgKCgidmFsaWRleiIgT1IgInZhbGlkYWNpw7NuIiBPUiAiYWRhcHRhY2nDs24iIE9SICJjb25zdHJ1Y2Npw7NuIiBPUiAiZXN0YW5kYXJpemFjacOzbiIpIEFORCAoImVzY2FsYSIgT1IgImJhdGVyw61hIGRlIHBydWViYXMiIE9SICJwcnVlYmEgcHNpY29sw7NnaWNhIiBPUiAiaW5zdHJ1bWVudG8iIE9SICJjdWVzdGlvbmFyaW8iIE9SICJ0ZXN0IikpCgpJbiBhZGRpdGlvbiwgdGhlIGZvbGxvd2luZyBmaWx0ZXJzIHdlcmUgdXNlZDoKCi0gWWVhciBmcm9tIDIwMTEgdG8gMjAyMAotIFVuZGVyZ3JhZHVhdGUgdGhlc2lzCi0gT3BlbiBBY2Nlc3MgdGhlc2lzCgpBbGwgdGhpcyBpcyBrZXB0IGluIGEgc3RhdGljIGxpbmsgdGhhdCB3aWxsIGJlIHVzZWQgdG8gaW5zcGVjdCBhbmQgc3RvcmUgdGhlIGRhdGEgaW4gYW4gZXhjZWwgZmlsZS4KCmBgYHtyfQpsaWJyYXJ5KHJ2ZXN0KQpsaWJyYXJ5KHRpZHl2ZXJzZSkKCnVybCA8LSAiaHR0cHM6Ly9hbGljaWEuY29uY3l0ZWMuZ29iLnBlL3Z1ZmluZC9TZWFyY2gvUmVzdWx0cz9maWx0ZXIlNUIlNUQ9Zm9ybWF0JTNBJTIyYmFjaGVsb3JUaGVzaXMlMjImZmlsdGVyJTVCJTVEPWV1X3JpZ2h0c19zdHJfbXYlM0ElMjJvcGVuQWNjZXNzJTIyJmxvb2tmb3I9JTI4JUUyJTgwJTlDQW4lQzMlQTFsaXNpcytGYWN0b3JpYWwlRTIlODAlOUQrT1IrJUUyJTgwJTlDQW4lQzMlQTFsaXNpcytkZStDb21wb25lbnRlcytQcmluY2lwYWxlcyVFMiU4MCU5RCtPUislRTIlODAlOUNBQ1AlRTIlODAlOUQrT1IrJUUyJTgwJTlDY29uZmlhYmlsaWRhZCVFMiU4MCU5RCtPUislRTIlODAlOUNmaWFiaWxpZGFkJUUyJTgwJTlEK09SKyVFMiU4MCU5Q2FuJUMzJUExbGlzaXMrcHNpY29tJUMzJUE5dHJpY28lRTIlODAlOUQrT1IrJTIyYmFyZW1hY2klQzMlQjNuJTIyJTI5K09SKyUyOCUyOCVF
MiU4MCU5Q3ZhbGlkZXolRTIlODAlOUQrT1IrJTIydmFsaWRhY2klQzMlQjNuJTIyK09SKyUyMmFkYXB0YWNpJUMzJUIzbiUyMitPUislMjJjb25zdHJ1Y2NpJUMzJUIzbiUyMitPUislMjJlc3RhbmRhcml6YWNpJUMzJUIzbiUyMiUyOStBTkQrJTI4JTIyZXNjYWxhJTIyK09SKyVFMiU4MCU5Q2JhdGVyJUMzJUFEYStkZStwcnVlYmFzJUUyJTgwJTlEK09SKyVFMiU4MCU5Q3BydWViYStwc2ljb2wlQzMlQjNnaWNhJUUyJTgwJTlEK09SKyUyMmluc3RydW1lbnRvJTIyK09SKyUyMmN1ZXN0aW9uYXJpbyUyMitPUislMjJ0ZXN0JTIyJTI5JTI5JnR5cGU9QWxsRmllbGRzJmRhdGVyYW5nZSU1QiU1RD1wdWJsaXNoRGF0ZSZwdWJsaXNoRGF0ZWZyb209MjAxMSZwdWJsaXNoRGF0ZXRvPTIwMjAiCgpkaW5hX2h0bWwgPC0gcmVhZF9odG1sKHVybCkgCmBgYAoKIyMgRGV0ZXJtaW5hdGUgdGhlIG51bWJlciBvZiBwYWdlcwoKCmBgYHtyfQpsYXN0X25fcGFnZSA8LSBkaW5hX2h0bWwgJT4lIAogIGh0bWxfZWxlbWVudHMoIi5wYWdpbmF0aW9uIGxpOmxhc3QtY2hpbGQiKSAlPiUgCiAgaHRtbF90ZXh0MigpICU+JSAKICBzdHJfZXh0cmFjdCgiWzAtOV0rIikKCmxhc3Rfbl9wYWdlCmBgYAoKIyMgTG9vcCBmb3IgZXh0cmFjdCBsaW5rcyBpbiBldmVyeSBwYWdlcyBhdmFpYmxlCgpgYGB7cn0Kb3B0aW9ucyh0aW1lb3V0PSAxZSsxMCkKYGBgCgpgYGB7cn0KZ3JvdXBfcGFnZXMgPC0gc3BsaXQoc2VxX2xlbihsYXN0X25fcGFnZSksIGNlaWxpbmcoc2VxX2xlbihsYXN0X25fcGFnZSkvNTApKQoKdG1wIDwtIGxpc3QoKQpocmVmX2RpbmEgPC0gbGlzdCgpCgpmb3IgKGkgaW4gc2VxX2Fsb25nKGdyb3VwX3BhZ2VzKSkgewogIGZvciAoaiBpbiBncm91cF9wYWdlc1tbaV1dKSB7CiAgICB0bXBbW2pdXSA8LSBwYXN0ZTAodXJsLCAiJnBhZ2U9IiwgaikKCiAgICBocmVmX2RpbmFbW2pdXSA8LSB0bXBbW2pdXSAlPiUKICAgICAgcmVhZF9odG1sKCkgJT4lCiAgICAgIGh0bWxfZWxlbWVudHMoIi5yZXN1bHQgLnJvdyAubGluayBhIikgJT4lCiAgICAgIGh0bWxfYXR0cnMoKSAlPiUKICAgICAgdW5saXN0KCkgJT4lCiAgICAgIGFzX3RpYmJsZSgpCgogICAgcHJpbnQoaikKICB9CiAgU3lzLnNsZWVwKDEyMCkKfQpgYGAKCmBgYHtyfQpjb21wbGV0ZV9ocmVmX2RpbmEgPC0gaHJlZl9kaW5hICU+JSAKICBiaW5kX3Jvd3MoKSAlPiUgCiAgbXV0YXRlKAogICAgdGVzaXNfdXJsID0gcGFzdGUwKCJodHRwczovL2FsaWNpYS5jb25jeXRlYy5nb2IucGUiLAogICAgICAgICAgICAgICAgICAgICAgIHZhbHVlKQogICkgJT4lIAogIHNlbGVjdCgtdmFsdWUpCgpjb21wbGV0ZV9ocmVmX2RpbmEKYGBgCgojIExvb3AgZm9yIHJlYWQgYWxsIHRoZXNpcyAKCiMjIFJlYWQgaHRtbCBvZiBlYWNoIHRoZXNpcwoKYGBge3J9CnJlYWRfaHRtbF90aGVzaXMgPC0gdmVjdG9yKCJsaXN0IiwgbnJvdyhjb21wbGV0ZV9ocmVmX2RpbmEpKQoKZm9yIChpIGluIHNlcV9sZW4obnJvdyhjb21w
bGV0ZV9ocmVmX2RpbmEpKSkgewogIGlmICghKGNvbXBsZXRlX2hyZWZfZGluYSR0ZXNpc191cmxbaV0gJWluJSBuYW1lcyhyZWFkX2h0bWxfdGhlc2lzKSkpIHsKICAgIGNhdChwYXN0ZSgiRG9pbmcgdGhlc2lzIG51bWJlciIsIGksICIuLi4iKSkKICAgIG9rIDwtIEZBTFNFCiAgICBjb3VudGVyIDwtIDAKICAgIHdoaWxlIChvayA9PSBGQUxTRSAmIGNvdW50ZXIgPD0gMTApIHsKICAgICAgY291bnRlciA8LSBjb3VudGVyICsgMQogICAgICBvdXQgPC0gdHJ5Q2F0Y2goCiAgICAgICAgZXhwciA9IHsKICAgICAgICAgIGNvbXBsZXRlX2hyZWZfZGluYSR0ZXNpc191cmxbaV0gJT4lIAogICAgICAgICAgICByZWFkX2h0bWwoKQogICAgICAgIH0sCiAgICAgICAgZXJyb3IgPSBmdW5jdGlvbihlKSB7CiAgICAgICAgICBTeXMuc2xlZXAoMikKICAgICAgICAgIGUKICAgICAgICB9CiAgICAgICkKICAgICAgaWYgKCJlcnJvciIgJWluJSBjbGFzcyhvdXQpKSB7CiAgICAgICAgY2F0KCIuIikKICAgICAgfSBlbHNlIHsKICAgICAgICBvayA8LSBUUlVFCiAgICAgICAgY2F0KCJTdWNjZXNzZnVsISIpCiAgICAgIH0KICAgIH0KICAgIGNhdCgiXG4iKQogICAgcmVhZF9odG1sX3RoZXNpc1tbaV1dIDwtIG91dAogICAgbmFtZXMocmVhZF9odG1sX3RoZXNpcylbaV0gPC0gY29tcGxldGVfaHJlZl9kaW5hJHRlc2lzX3VybFtpXQogIH0KfQpgYGAKCgojIyBFeHRyYWN0IGluZm9ybWF0aW9uCgpUaGlzIGZ1bmN0aW9uIGhlbHAgdG8gZXh0cmFjdCBpbmZvcm1hdGlvbiBhYm91dCB0aGVzaXMgbGlrZSB0aXRsZSwgYWJzdHJhY3QsIGV0Yy4KCmBgYHtyfQpleHRyYWN0X2luZm9ybWF0aW9uIDwtIGZ1bmN0aW9uKGh0bWwpIHsKICB0aXR1bG8gPC0gaHRtbCAlPiUgCiAgICBodG1sX2VsZW1lbnRzKCIubWVkaWEtYm9keSBoMSIpICU+JSAKICAgIGh0bWxfdGV4dDIoKSAlPiUgCiAgICB0aWJibGUoVGl0dWxvID0gLikKICAKICBpbmZvX2lkZW50IDwtIGh0bWwgJT4lICAKICAgIGh0bWxfdGFibGUoKSAlPiUgCiAgICBtYWdyaXR0cjo6ZXh0cmFjdDIoMSkgJT4lIAogICAgbXV0YXRlKFgxID0gc3RyX3JlbW92ZShYMSwgIjoiKSkgJT4lIAogICAgcGl2b3Rfd2lkZXIoCiAgICAgIG5hbWVzX2Zyb20gPSBYMSwKICAgICAgdmFsdWVzX2Zyb20gPSBYMgogICAgKQogIAogIHJlc3VtZW4gPC0gaHRtbCAlPiUgIAogICAgaHRtbF90YWJsZSgpICU+JSAKICAgIG1hZ3JpdHRyOjpleHRyYWN0MigyKSAlPiUgCiAgICBtdXRhdGUoWDEgPSBzdHJfcmVtb3ZlKFgxLCAiOiIpKSAlPiUgCiAgICBwaXZvdF93aWRlcigKICAgICAgbmFtZXNfZnJvbSA9IFgxLAogICAgICB2YWx1ZXNfZnJvbSA9IFgyCiAgICApCiAgCiAgaW5mb3JtYXRpb24gPC0gYmluZF9jb2xzKAogICAgaW5mb19pZGVudCwKICAgIHRpdHVsbywKICAgIHJlc3VtZW4KICApICU+JSAKICAgIHJlbG9jYXRlKFRpdHVsbywgLmFmdGVyID0gIkF1dG9yIFByaW5jaXBhbCIpCiAgCiAgcmV0dXJuKGluZm9ybWF0
aW9uKQp9CmBgYAoKYGBge3J9CnRoZXNpc19pbmZvcm1hdGlvbiA8LSB2ZWN0b3IoImxpc3QiLCBsZW5ndGgocmVhZF9odG1sX3RoZXNpcykpCgpmb3IgKGkgaW4gc2VxX2xlbihsZW5ndGgocmVhZF9odG1sX3RoZXNpcykpKSB7CiAgdGhlc2lzX2luZm9ybWF0aW9uW1tpXV0gPC0gZXh0cmFjdF9pbmZvcm1hdGlvbihyZWFkX2h0bWxfdGhlc2lzW1tpXV0pCn0KYGBgCgpKb2luIGZ1bGwgaW5mb3JtYXRpb24gYWJvdXQgdGhlIHRoZXNpcwoKYGBge3J9CnRoZXNpc19pbmZvcm1hdGlvbiA8LSBiaW5kX3Jvd3ModGhlc2lzX2luZm9ybWF0aW9uKSAlPiUgCiAgcmVsb2NhdGUoYE90cm9zIEF1dG9yZXNgLCAuYWZ0ZXIgPSAiQXV0b3IgUHJpbmNpcGFsIikKCnRoZXNpc19pbmZvcm1hdGlvbgpgYGAKCiMjIFJlbW92ZSBkdXBsaWNhdGVzCgpUaGUgdGFibGUgaGFzIGByIG5yb3codGhlc2lzX2luZm9ybWF0aW9uKWAgYXQgdGhlIG1vbWVudC4KCmBgYHtyfQp0aGVzaXNfZmluYWwgPC0gdGhlc2lzX2luZm9ybWF0aW9uICU+JSAKICBtdXRhdGUoCiAgICB0aXR1bG9fdG1wID0gc3RyX3RvX3VwcGVyKFRpdHVsbykKICApICU+JQogIGRpc3RpbmN0KHRpdHVsb190bXAsIC5rZWVwX2FsbCA9IFRSVUUpICU+JSAKICBzZWxlY3QoLWModGl0dWxvX3RtcCkpCmBgYAoKTm93LCBpdCBoYXMgYHIgbnJvdyh0aGVzaXNfZmluYWwpYC4KCiMgRXhwb3J0IHRvIFhMU1gKCmBgYHtyfQpvcGVueGxzeDo6d3JpdGUueGxzeCh0aGVzaXNfZmluYWwsCiAgICAgICAgICAgICAgICAgICAgICJUYWJsZSBjb21wbGV0IHRoZXNpcyBwc3ljaG9tZXRyaWMueGxzeCIpCmBgYAo=